knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(bslib)
library(dplyr)
library(DT)
library(ggplot2)
library(glue)
library(here)
library(knitr)
library(lubridate)
library(plotly)
library(purrr)
library(readr)
library(rlang)
library(scales)
library(shiny)
library(stringr)
library(tidyr)
theme_set(theme_bw())

# Convert a size in bytes to terabytes, using binary units
# (one terabyte is taken to be 1024^4 bytes).
# Vectorised: the result has the same length as `x`.
bytes_to_tb <- function(x) {
  bytes_per_tb <- 1024^4
  x / bytes_per_tb
}

# Drop rows whose username does not belong to an actual person.
#
# dat:     data frame to filter.
# usercol: unquoted name of the column holding usernames
#          (defaults to `username`).
#
# Returns `dat` without the listed service accounts and without usernames
# made up solely of digits. Rows with a missing username are dropped too,
# because `filter()` discards rows whose condition evaluates to NA.
filter_users <- function(dat, usercol = username) {
  non_people <- c("allusers", "rpcuser", "slurm")
  dat %>%
    filter(
      !({{ usercol }} %in% non_people), # not actual people
      # Anchored on both ends so only entirely numeric usernames are removed;
      # an unanchored "[0-9]" would also drop names that merely contain a digit.
      !str_detect({{ usercol }}, "^[0-9]+$")
    )
}

# Report parameters.
n_top_users <- params$n_top_users
input_dir <- params$input_dir # here("data")

# Files whose path mentions one of these names are excluded from the
# per-user table.
aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
aggregated_pattern <- paste(aggregated_filetypes, collapse = "|")

# TODO: only load last N weeks of data to keep RAM usage reasonably low
# Every directory below `input_dir` (but not `input_dir` itself) ...
sub_dirs <- Filter(function(d) d != input_dir, list.dirs(input_dir))
# ... and the full paths of the entries each of them contains.
report_files <- unlist(lapply(sub_dirs, list.files, full.names = TRUE))

# One row per file. The full path is split on "." into five pieces; the first
# piece still carries the directory part, so only its basename is parsed as
# the date. `too_few = "debug"` keeps paths with fewer pieces instead of
# failing on them.
user_dat <- tibble(filename = report_files) %>%
  filter(!str_detect(filename, aggregated_pattern)) %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("date", "path", "username", "file", "ext"),
    too_few = "debug",
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(basename(date)))

# Distinct dates that could be parsed from the file names, and the latest one.
parsed_dates <- user_dat$date
dates <- unique(parsed_dates[!is.na(parsed_dates)])
most_recent_date <- max(dates)

# Path of the all-users summary for `/data/CCBR` on the most recent date.
allusers_summary_file <- user_dat %>%
  filter(
    username == "allusers",
    date == most_recent_date,
    file == "summary",
    path == "_data_CCBR"
  ) %>%
  pull(filename)

# Total bytes recorded for the top-level folder, expressed in terabytes.
total_usage_tb <- read_tsv(allusers_summary_file) %>%
  filter(FolderPath == "/data/CCBR") %>%
  pull(TotalBytes) %>%
  bytes_to_tb()
# TODO disk_usage_tb doesn't agree with output from `df`

# Path of the all-users grubbers text file for `/data/CCBR` on the most
# recent date.
allusers_grubbers_file <- user_dat %>%
  filter(
    username == "allusers",
    date == most_recent_date,
    file == "grubbers",
    ext == "txt",
    path == "_data_CCBR"
  ) %>%
  pull(filename)

grubbers_summary <- read_tsv(allusers_grubbers_file)

# From here on only rows belonging to real people are kept.
user_dat <- filter_users(user_dat)
# Distinct usernames that remain, across all dates.
usernames <- unique(user_dat[["username"]])

Total disk usage

# Disk usage table read from the results folder, and the date it records.
disk_usage <- read_tsv(here("results", "disk_usage.txt"))
df_date <- as_date(disk_usage[["date"]])

# Markdown table with "used / size" and the percentage in use.
usage_table <- disk_usage %>%
  mutate(Usage = glue("{Used} / {Size}")) %>%
  select(Usage, `Use%`) %>%
  kable()
# Number of distinct users together with the date of the disk usage table.
user_count_text <- glue("{length(usernames)} users as of\n{df_date}")

# Two value boxes side by side: disk space and user count.
layout_column_wrap(
  width = 1 / 2,
  value_box(
    title = markdown("Disk space in `/data/CCBR`"),
    value = markdown(usage_table)
  ),
  value_box(
    title = markdown("Users"),
    value = markdown(user_count_text)
  )
)

Disk space in /data/CCBR

Usage Use%
199T / 200T 100%

Users

34 users as of 2023-10-16

Most recent summary (2023-10-09)

Usage by top users for each spacesavers metric.

# Summary files belonging to the most recent date.
recent_summary_files <- user_dat %>%
  filter(date == most_recent_date, file == "summary") %>%
  pull(filename)

# Read each file, tag its rows with the path they came from, stack the
# tables, and split the path on "." to recover path/username/file/ext.
summary_dat_recent <- recent_summary_files %>%
  map(function(tsv_path) {
    mutate(read_tsv(tsv_path), filename = tsv_path)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  )

# Names of the numeric columns, in order of first appearance; these are the
# metrics shown in the report.
summary_metrics <- summary_dat_recent %>%
  pivot_longer(where(is.numeric), names_to = "metric") %>%
  distinct(metric) %>%
  pull(metric)

# Does `x` span a large range of values?
#
# x:                  numeric vector; missing values are ignored.
# n_orders_magnitude: the spread (max - min) must be at least
#                     10^n_orders_magnitude to count as large.
#
# Returns a single TRUE/FALSE. Empty or all-missing input gives FALSE.
# NOTE(review): this compares the absolute difference between max and min,
# not their ratio -- confirm that is the intended meaning of "orders of
# magnitude".
is_large_range <- function(x, n_orders_magnitude = 5) {
  # Without this, a single NA would make range() return NA and the result
  # could not be used as an `if` condition.
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(FALSE)
  }
  xrange <- range(x)
  (xrange[2] - xrange[1]) >= 10^n_orders_magnitude
}

# Horizontal bar chart of one metric per user.
#
# dat:      data frame with columns `username`, `FolderPath` and the column
#           named by `x_metric`.
# x_metric: name of the metric column, given as a string.
#
# Returns a ggplot object. The `text` aesthetic (username, metric name and
# folder path) is not drawn by ggplot itself; it is there for ggplotly
# tooltips.
plot_user_metric <- function(dat, x_metric) {
  dat %>%
    ggplot(aes(
      # `.data[[...]]` looks the column up by its string name.
      x = .data[[x_metric]],
      y = username,
      fill = .data[[x_metric]],
      text = glue("{username}\n{x_metric}\n{FolderPath}")
    )) +
    geom_col() +
    # TODO: ggplotly doesn't know what to do with scale::label_log
    # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } +
    labs(x = x_metric, y = "") +
    theme(legend.position = "none")
}

# Build one tab for a folder, holding one interactive plot per metric.
#
# dat:         summary data with columns `FolderPath`, `username` and every
#              column named in the global `summary_metrics`.
# folder_path: value of `FolderPath` to keep.
# plot_fcn:    function(data, metric_name) returning a ggplot.
#
# Uses the globals `summary_metrics` (metric column names) and `n_top_users`
# (how many users to pick per metric).
#
# Returns a `nav_panel` titled with the folder path and containing a pill
# list with one panel per metric.
panel_summary <- function(dat,
                          folder_path = "/data/CCBR",
                          plot_fcn = plot_metric_time) {
  summary_dat_folder <- dat %>%
    filter(FolderPath == folder_path)

  # Long form of the metrics, computed once and reused for every metric.
  # OverallScore is negated so that the lowest scores rank first when taking
  # the maximum of `value_adj`.
  metrics_long <- summary_dat_folder %>%
    pivot_longer(all_of(summary_metrics),
      names_to = "metric"
    ) %>%
    mutate(value_adj = case_when(
      metric == "OverallScore" ~ -value,
      TRUE ~ value
    ))

  # Users that are among the top `n_top_users` rows for at least one metric.
  top_users <- metrics_long %>%
    group_by(metric) %>%
    slice_max(order_by = value_adj, n = n_top_users) %>%
    pull(username) %>%
    unique()

  plots <- lapply(summary_metrics, function(y_metric) {
    # Order the top users by this metric (ascending `value_adj`); the order
    # becomes the factor level order used by the plot.
    user_order <- metrics_long %>%
      filter(username %in% top_users, metric == y_metric) %>%
      arrange(value_adj) %>%
      pull(username) %>%
      unique()
    p <- summary_dat_folder %>%
      filter(username %in% user_order) %>%
      mutate(username = factor(username, levels = user_order)) %>%
      plot_fcn(y_metric)
    nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
  })

  nav_panel(
    title = markdown(glue("`{folder_path}`")),
    navset_pill_list(!!!plots)
  )
}

# One tab per folder, each showing per-user bar charts for the most recent
# summary.
navset_tab(
  panel_summary(summary_dat_recent, "/data/CCBR", plot_user_metric),
  panel_summary(summary_dat_recent, "/data/CCBR/rawdata", plot_user_metric),
  panel_summary(summary_dat_recent, "/data/CCBR/projects", plot_user_metric)
)
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore

Summary over time

Usage by top users for each spacesavers metric.

# Summary files from every available date.
all_summary_files <- user_dat %>%
  filter(file == "summary") %>%
  pull(filename)

# Read each file, tag its rows with the path they came from, stack the
# tables, and split the path on "." into its pieces. The date is whatever
# follows the last "/" of the first piece.
summary_dat_all <- all_summary_files %>%
  map(function(tsv_path) {
    mutate(read_tsv(tsv_path), filename = tsv_path)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(str_replace(basepath, ".*/", "")))

# Line-and-point chart of one metric over time, one colour per user.
#
# dat:      data frame with columns `date`, `username`, `FolderPath` and the
#           column named by `y_metric`.
# y_metric: name of the metric column, given as a string.
#
# Returns a ggplot object. The `text` aesthetic on the points (username,
# metric name, folder path and date) is there for ggplotly tooltips.
plot_metric_time <- function(dat, y_metric) {
  dat %>%
    ggplot(aes(
      x = date,
      # `.data[[...]]` looks the column up by its string name.
      y = .data[[y_metric]],
      color = username
    )) +
    geom_line(alpha = 0.7) +
    geom_point(aes(text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}"))) +
    labs(y = y_metric)
}

# One tab per folder, each showing the metrics over time for the top users.
navset_tab(
  panel_summary(summary_dat_all, "/data/CCBR", plot_metric_time),
  panel_summary(summary_dat_all, "/data/CCBR/rawdata", plot_metric_time),
  panel_summary(summary_dat_all, "/data/CCBR/projects", plot_metric_time)
)
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore

High value duplicate files (2023-10-09)

# Grubbers TSV files for `_data_CCBR` from the latest date that has any.
grub_files <- user_dat %>%
  filter_users() %>%
  filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>%
  slice_max(order_by = date) %>%
  pull(filename)

# The files are read without a header row, so readr names the columns
# X1..X5; each row is tagged with the file it came from.
grub_raw <- grub_files %>%
  map(function(tsv_path) {
    mutate(read_tsv(tsv_path, col_names = FALSE), filename = tsv_path)
  }) %>%
  list_rbind()

grub_dat <- grub_raw %>%
  rename(
    file_hash = X1,
    file_count = X2,
    total_disk_usage = X3,
    single_disk_usage = X4,
    filepaths = X5
  ) %>%
  # Recover date/path/username/file/ext from the file path.
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("date", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(basename(date))) %>%
  filter_users() %>%
  # Both usage columns are split on a single space into a value and a unit.
  separate_wider_delim(
    total_disk_usage,
    delim = " ",
    names = c("total_disk_usage_value", "total_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  separate_wider_delim(
    single_disk_usage,
    delim = " ",
    names = c("single_disk_usage_value", "single_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  mutate(across(
    all_of(c("total_disk_usage_value", "single_disk_usage_value")),
    as.numeric
  ))

# Keep a dated copy of the parsed grubbers table in the results folder.
write_tsv(grub_dat, here("results", glue("grub-dat_{today()}.tsv")))

# Largest duplicate sets first.
# NOTE(review): the value is labelled as GB without checking
# `total_disk_usage_unit` -- confirm every row uses the same unit.
top_files <- grub_dat %>%
  arrange(desc(total_disk_usage_value)) %>%
  select(disk_usage_gb = total_disk_usage_value, username, filepaths)

card(
  card_header("Top files"),
  datatable(top_files, fillContainer = TRUE)
)
Top files